Loading the Train data (the Test and auxiliary tables are currently commented out)

# Read the application training table from the current working directory
# into `Train`.
# NOTE(review): fread is presumably data.table::fread -- confirm against the
# library() calls, which are not visible in this part of the file.
Train <- fread("./application_train.csv")
# The test set and the auxiliary tables below are not loaded; their reads are
# left commented out.
#Test<-fread("./application_test.csv")
#pre_app<-fread("./previous_application.csv")
#bur<-fread("./bureau.csv")
#card_bal<-fread("./credit_card_balance.csv")
#payment<-fread("./installments_payments.csv")
#P<-fread("./POS_CASH_balance.csv")
#bur_bal<-fread("./bureau_balance.csv")

Observing Train data attributes

# List the variable names in the Train data (call left disabled)
#names(Train)
# Show the structure of the Train data: the row and column counts, then one
# line per column with its type and first few values
glimpse(Train)
## Observations: 307,511
## Variables: 122
## $ SK_ID_CURR                   <int> 100002, 100003, 100004, 100006, 1...
## $ TARGET                       <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ NAME_CONTRACT_TYPE           <chr> "Cash loans", "Cash loans", "Revo...
## $ CODE_GENDER                  <chr> "M", "F", "M", "F", "M", "M", "F"...
## $ FLAG_OWN_CAR                 <chr> "N", "N", "Y", "N", "N", "N", "Y"...
## $ FLAG_OWN_REALTY              <chr> "Y", "N", "Y", "Y", "Y", "Y", "Y"...
## $ CNT_CHILDREN                 <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, ...
## $ AMT_INCOME_TOTAL             <dbl> 202500.00, 270000.00, 67500.00, 1...
## $ AMT_CREDIT                   <dbl> 406597.5, 1293502.5, 135000.0, 31...
## $ AMT_ANNUITY                  <dbl> 24700.5, 35698.5, 6750.0, 29686.5...
## $ AMT_GOODS_PRICE              <dbl> 351000, 1129500, 135000, 297000, ...
## $ NAME_TYPE_SUITE              <chr> "Unaccompanied", "Family", "Unacc...
## $ NAME_INCOME_TYPE             <chr> "Working", "State servant", "Work...
## $ NAME_EDUCATION_TYPE          <chr> "Secondary / secondary special", ...
## $ NAME_FAMILY_STATUS           <chr> "Single / not married", "Married"...
## $ NAME_HOUSING_TYPE            <chr> "House / apartment", "House / apa...
## $ REGION_POPULATION_RELATIVE   <dbl> 0.018801, 0.003541, 0.010032, 0.0...
## $ DAYS_BIRTH                   <int> -9461, -16765, -19046, -19005, -1...
## $ DAYS_EMPLOYED                <int> -637, -1188, -225, -3039, -3038, ...
## $ DAYS_REGISTRATION            <dbl> -3648, -1186, -4260, -9833, -4311...
## $ DAYS_ID_PUBLISH              <int> -2120, -291, -2531, -2437, -3458,...
## $ OWN_CAR_AGE                  <dbl> NA, NA, 26, NA, NA, NA, 17, 8, NA...
## $ FLAG_MOBIL                   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ FLAG_EMP_PHONE               <int> 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, ...
## $ FLAG_WORK_PHONE              <int> 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, ...
## $ FLAG_CONT_MOBILE             <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ FLAG_PHONE                   <int> 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, ...
## $ FLAG_EMAIL                   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ OCCUPATION_TYPE              <chr> "Laborers", "Core staff", "Labore...
## $ CNT_FAM_MEMBERS              <dbl> 1, 2, 1, 2, 1, 2, 3, 2, 2, 1, 3, ...
## $ REGION_RATING_CLIENT         <int> 2, 1, 2, 2, 2, 2, 2, 3, 2, 2, 2, ...
## $ REGION_RATING_CLIENT_W_CITY  <int> 2, 1, 2, 2, 2, 2, 2, 3, 2, 2, 2, ...
## $ WEEKDAY_APPR_PROCESS_START   <chr> "WEDNESDAY", "MONDAY", "MONDAY", ...
## $ HOUR_APPR_PROCESS_START      <int> 10, 11, 9, 17, 11, 16, 16, 16, 14...
## $ REG_REGION_NOT_LIVE_REGION   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ REG_REGION_NOT_WORK_REGION   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ LIVE_REGION_NOT_WORK_REGION  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ REG_CITY_NOT_LIVE_CITY       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ REG_CITY_NOT_WORK_CITY       <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, ...
## $ LIVE_CITY_NOT_WORK_CITY      <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, ...
## $ ORGANIZATION_TYPE            <chr> "Business Entity Type 3", "School...
## $ EXT_SOURCE_1                 <dbl> 0.08303697, 0.31126731, NA, NA, N...
## $ EXT_SOURCE_2                 <dbl> 0.2629486, 0.6222458, 0.5559121, ...
## $ EXT_SOURCE_3                 <dbl> 0.13937578, NA, 0.72956669, NA, N...
## $ APARTMENTS_AVG               <dbl> 0.0247, 0.0959, NA, NA, NA, NA, N...
## $ BASEMENTAREA_AVG             <dbl> 0.0369, 0.0529, NA, NA, NA, NA, N...
## $ YEARS_BEGINEXPLUATATION_AVG  <dbl> 0.9722, 0.9851, NA, NA, NA, NA, N...
## $ YEARS_BUILD_AVG              <dbl> 0.6192, 0.7960, NA, NA, NA, NA, N...
## $ COMMONAREA_AVG               <dbl> 0.0143, 0.0605, NA, NA, NA, NA, N...
## $ ELEVATORS_AVG                <dbl> 0.00, 0.08, NA, NA, NA, NA, NA, N...
## $ ENTRANCES_AVG                <dbl> 0.0690, 0.0345, NA, NA, NA, NA, N...
## $ FLOORSMAX_AVG                <dbl> 0.0833, 0.2917, NA, NA, NA, NA, N...
## $ FLOORSMIN_AVG                <dbl> 0.1250, 0.3333, NA, NA, NA, NA, N...
## $ LANDAREA_AVG                 <dbl> 0.0369, 0.0130, NA, NA, NA, NA, N...
## $ LIVINGAPARTMENTS_AVG         <dbl> 0.0202, 0.0773, NA, NA, NA, NA, N...
## $ LIVINGAREA_AVG               <dbl> 0.0190, 0.0549, NA, NA, NA, NA, N...
## $ NONLIVINGAPARTMENTS_AVG      <dbl> 0.0000, 0.0039, NA, NA, NA, NA, N...
## $ NONLIVINGAREA_AVG            <dbl> 0.0000, 0.0098, NA, NA, NA, NA, N...
## $ APARTMENTS_MODE              <dbl> 0.0252, 0.0924, NA, NA, NA, NA, N...
## $ BASEMENTAREA_MODE            <dbl> 0.0383, 0.0538, NA, NA, NA, NA, N...
## $ YEARS_BEGINEXPLUATATION_MODE <dbl> 0.9722, 0.9851, NA, NA, NA, NA, N...
## $ YEARS_BUILD_MODE             <dbl> 0.6341, 0.8040, NA, NA, NA, NA, N...
## $ COMMONAREA_MODE              <dbl> 0.0144, 0.0497, NA, NA, NA, NA, N...
## $ ELEVATORS_MODE               <dbl> 0.0000, 0.0806, NA, NA, NA, NA, N...
## $ ENTRANCES_MODE               <dbl> 0.0690, 0.0345, NA, NA, NA, NA, N...
## $ FLOORSMAX_MODE               <dbl> 0.0833, 0.2917, NA, NA, NA, NA, N...
## $ FLOORSMIN_MODE               <dbl> 0.1250, 0.3333, NA, NA, NA, NA, N...
## $ LANDAREA_MODE                <dbl> 0.0377, 0.0128, NA, NA, NA, NA, N...
## $ LIVINGAPARTMENTS_MODE        <dbl> 0.0220, 0.0790, NA, NA, NA, NA, N...
## $ LIVINGAREA_MODE              <dbl> 0.0198, 0.0554, NA, NA, NA, NA, N...
## $ NONLIVINGAPARTMENTS_MODE     <dbl> 0.0000, 0.0000, NA, NA, NA, NA, N...
## $ NONLIVINGAREA_MODE           <dbl> 0.0000, 0.0000, NA, NA, NA, NA, N...
## $ APARTMENTS_MEDI              <dbl> 0.0250, 0.0968, NA, NA, NA, NA, N...
## $ BASEMENTAREA_MEDI            <dbl> 0.0369, 0.0529, NA, NA, NA, NA, N...
## $ YEARS_BEGINEXPLUATATION_MEDI <dbl> 0.9722, 0.9851, NA, NA, NA, NA, N...
## $ YEARS_BUILD_MEDI             <dbl> 0.6243, 0.7987, NA, NA, NA, NA, N...
## $ COMMONAREA_MEDI              <dbl> 0.0144, 0.0608, NA, NA, NA, NA, N...
## $ ELEVATORS_MEDI               <dbl> 0.00, 0.08, NA, NA, NA, NA, NA, N...
## $ ENTRANCES_MEDI               <dbl> 0.0690, 0.0345, NA, NA, NA, NA, N...
## $ FLOORSMAX_MEDI               <dbl> 0.0833, 0.2917, NA, NA, NA, NA, N...
## $ FLOORSMIN_MEDI               <dbl> 0.1250, 0.3333, NA, NA, NA, NA, N...
## $ LANDAREA_MEDI                <dbl> 0.0375, 0.0132, NA, NA, NA, NA, N...
## $ LIVINGAPARTMENTS_MEDI        <dbl> 0.0205, 0.0787, NA, NA, NA, NA, N...
## $ LIVINGAREA_MEDI              <dbl> 0.0193, 0.0558, NA, NA, NA, NA, N...
## $ NONLIVINGAPARTMENTS_MEDI     <dbl> 0.0000, 0.0039, NA, NA, NA, NA, N...
## $ NONLIVINGAREA_MEDI           <dbl> 0.0000, 0.0100, NA, NA, NA, NA, N...
## $ FONDKAPREMONT_MODE           <chr> "reg oper account", "reg oper acc...
## $ HOUSETYPE_MODE               <chr> "block of flats", "block of flats...
## $ TOTALAREA_MODE               <dbl> 0.0149, 0.0714, NA, NA, NA, NA, N...
## $ WALLSMATERIAL_MODE           <chr> "Stone, brick", "Block", "", "", ...
## $ EMERGENCYSTATE_MODE          <chr> "No", "No", "", "", "", "", "", "...
## $ OBS_30_CNT_SOCIAL_CIRCLE     <dbl> 2, 1, 0, 2, 0, 0, 1, 2, 1, 2, 0, ...
## $ DEF_30_CNT_SOCIAL_CIRCLE     <dbl> 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ OBS_60_CNT_SOCIAL_CIRCLE     <dbl> 2, 1, 0, 2, 0, 0, 1, 2, 1, 2, 0, ...
## $ DEF_60_CNT_SOCIAL_CIRCLE     <dbl> 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ DAYS_LAST_PHONE_CHANGE       <dbl> -1134, -828, -815, -617, -1106, -...
## $ FLAG_DOCUMENT_2              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_3              <int> 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, ...
## $ FLAG_DOCUMENT_4              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_5              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_6              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_7              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_8              <int> 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_9              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_10             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_11             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_12             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_13             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_14             <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_15             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_16             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_17             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_18             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_19             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_20             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FLAG_DOCUMENT_21             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ AMT_REQ_CREDIT_BUREAU_HOUR   <dbl> 0, 0, 0, NA, 0, 0, 0, 0, 0, NA, 0...
## $ AMT_REQ_CREDIT_BUREAU_DAY    <dbl> 0, 0, 0, NA, 0, 0, 0, 0, 0, NA, 0...
## $ AMT_REQ_CREDIT_BUREAU_WEEK   <dbl> 0, 0, 0, NA, 0, 0, 0, 0, 0, NA, 0...
## $ AMT_REQ_CREDIT_BUREAU_MON    <dbl> 0, 0, 0, NA, 0, 0, 1, 0, 0, NA, 1...
## $ AMT_REQ_CREDIT_BUREAU_QRT    <dbl> 0, 0, 0, NA, 0, 1, 1, 0, 0, NA, 0...
## $ AMT_REQ_CREDIT_BUREAU_YEAR   <dbl> 1, 0, 0, NA, 0, 1, 2, 0, 1, NA, 0...
# Print the first 10 rows of the Train data
head(Train, n=10)
##     SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR
##  1:     100002      1         Cash loans           M            N
##  2:     100003      0         Cash loans           F            N
##  3:     100004      0    Revolving loans           M            Y
##  4:     100006      0         Cash loans           F            N
##  5:     100007      0         Cash loans           M            N
##  6:     100008      0         Cash loans           M            N
##  7:     100009      0         Cash loans           F            Y
##  8:     100010      0         Cash loans           M            Y
##  9:     100011      0         Cash loans           F            N
## 10:     100012      0    Revolving loans           M            N
##     FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY
##  1:               Y            0           202500   406597.5     24700.5
##  2:               N            0           270000  1293502.5     35698.5
##  3:               Y            0            67500   135000.0      6750.0
##  4:               Y            0           135000   312682.5     29686.5
##  5:               Y            0           121500   513000.0     21865.5
##  6:               Y            0            99000   490495.5     27517.5
##  7:               Y            1           171000  1560726.0     41301.0
##  8:               Y            0           360000  1530000.0     42075.0
##  9:               Y            0           112500  1019610.0     33826.5
## 10:               Y            0           135000   405000.0     20250.0
##     AMT_GOODS_PRICE NAME_TYPE_SUITE     NAME_INCOME_TYPE
##  1:          351000   Unaccompanied              Working
##  2:         1129500          Family        State servant
##  3:          135000   Unaccompanied              Working
##  4:          297000   Unaccompanied              Working
##  5:          513000   Unaccompanied              Working
##  6:          454500 Spouse, partner        State servant
##  7:         1395000   Unaccompanied Commercial associate
##  8:         1530000   Unaccompanied        State servant
##  9:          913500        Children            Pensioner
## 10:          405000   Unaccompanied              Working
##               NAME_EDUCATION_TYPE   NAME_FAMILY_STATUS NAME_HOUSING_TYPE
##  1: Secondary / secondary special Single / not married House / apartment
##  2:              Higher education              Married House / apartment
##  3: Secondary / secondary special Single / not married House / apartment
##  4: Secondary / secondary special       Civil marriage House / apartment
##  5: Secondary / secondary special Single / not married House / apartment
##  6: Secondary / secondary special              Married House / apartment
##  7:              Higher education              Married House / apartment
##  8:              Higher education              Married House / apartment
##  9: Secondary / secondary special              Married House / apartment
## 10: Secondary / secondary special Single / not married House / apartment
##     REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION
##  1:                   0.018801      -9461          -637             -3648
##  2:                   0.003541     -16765         -1188             -1186
##  3:                   0.010032     -19046          -225             -4260
##  4:                   0.008019     -19005         -3039             -9833
##  5:                   0.028663     -19932         -3038             -4311
##  6:                   0.035792     -16941         -1588             -4970
##  7:                   0.035792     -13778         -3130             -1213
##  8:                   0.003122     -18850          -449             -4597
##  9:                   0.018634     -20099        365243             -7427
## 10:                   0.019689     -14469         -2019            -14437
##     DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE
##  1:           -2120          NA          1              1               0
##  2:            -291          NA          1              1               0
##  3:           -2531          26          1              1               1
##  4:           -2437          NA          1              1               0
##  5:           -3458          NA          1              1               0
##  6:            -477          NA          1              1               1
##  7:            -619          17          1              1               0
##  8:           -2379           8          1              1               1
##  9:           -3514          NA          1              0               0
## 10:           -3992          NA          1              1               0
##     FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS
##  1:                1          1          0        Laborers               1
##  2:                1          1          0      Core staff               2
##  3:                1          1          0        Laborers               1
##  4:                1          0          0        Laborers               2
##  5:                1          0          0      Core staff               1
##  6:                1          1          0        Laborers               2
##  7:                1          1          0     Accountants               3
##  8:                1          0          0        Managers               2
##  9:                1          0          0                               2
## 10:                1          0          0        Laborers               1
##     REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY
##  1:                    2                           2
##  2:                    1                           1
##  3:                    2                           2
##  4:                    2                           2
##  5:                    2                           2
##  6:                    2                           2
##  7:                    2                           2
##  8:                    3                           3
##  9:                    2                           2
## 10:                    2                           2
##     WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START
##  1:                  WEDNESDAY                      10
##  2:                     MONDAY                      11
##  3:                     MONDAY                       9
##  4:                  WEDNESDAY                      17
##  5:                   THURSDAY                      11
##  6:                  WEDNESDAY                      16
##  7:                     SUNDAY                      16
##  8:                     MONDAY                      16
##  9:                  WEDNESDAY                      14
## 10:                   THURSDAY                       8
##     REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION
##  1:                          0                          0
##  2:                          0                          0
##  3:                          0                          0
##  4:                          0                          0
##  5:                          0                          0
##  6:                          0                          0
##  7:                          0                          0
##  8:                          0                          0
##  9:                          0                          0
## 10:                          0                          0
##     LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY
##  1:                           0                      0
##  2:                           0                      0
##  3:                           0                      0
##  4:                           0                      0
##  5:                           0                      0
##  6:                           0                      0
##  7:                           0                      0
##  8:                           0                      0
##  9:                           0                      0
## 10:                           0                      0
##     REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY      ORGANIZATION_TYPE
##  1:                      0                       0 Business Entity Type 3
##  2:                      0                       0                 School
##  3:                      0                       0             Government
##  4:                      0                       0 Business Entity Type 3
##  5:                      1                       1               Religion
##  6:                      0                       0                  Other
##  7:                      0                       0 Business Entity Type 3
##  8:                      1                       1                  Other
##  9:                      0                       0                    XNA
## 10:                      0                       0            Electricity
##     EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG
##  1:   0.08303697    0.2629486    0.1393758         0.0247           0.0369
##  2:   0.31126731    0.6222458           NA         0.0959           0.0529
##  3:           NA    0.5559121    0.7295667             NA               NA
##  4:           NA    0.6504417           NA             NA               NA
##  5:           NA    0.3227383           NA             NA               NA
##  6:           NA    0.3542247    0.6212263             NA               NA
##  7:   0.77476141    0.7239999    0.4920601             NA               NA
##  8:           NA    0.7142793    0.5406545             NA               NA
##  9:   0.58733405    0.2057473    0.7517237             NA               NA
## 10:           NA    0.7466436           NA             NA               NA
##     YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG
##  1:                      0.9722          0.6192         0.0143
##  2:                      0.9851          0.7960         0.0605
##  3:                          NA              NA             NA
##  4:                          NA              NA             NA
##  5:                          NA              NA             NA
##  6:                          NA              NA             NA
##  7:                          NA              NA             NA
##  8:                          NA              NA             NA
##  9:                          NA              NA             NA
## 10:                          NA              NA             NA
##     ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG
##  1:          0.00        0.0690        0.0833        0.1250       0.0369
##  2:          0.08        0.0345        0.2917        0.3333       0.0130
##  3:            NA            NA            NA            NA           NA
##  4:            NA            NA            NA            NA           NA
##  5:            NA            NA            NA            NA           NA
##  6:            NA            NA            NA            NA           NA
##  7:            NA            NA            NA            NA           NA
##  8:            NA            NA            NA            NA           NA
##  9:            NA            NA            NA            NA           NA
## 10:            NA            NA            NA            NA           NA
##     LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG
##  1:               0.0202         0.0190                  0.0000
##  2:               0.0773         0.0549                  0.0039
##  3:                   NA             NA                      NA
##  4:                   NA             NA                      NA
##  5:                   NA             NA                      NA
##  6:                   NA             NA                      NA
##  7:                   NA             NA                      NA
##  8:                   NA             NA                      NA
##  9:                   NA             NA                      NA
## 10:                   NA             NA                      NA
##     NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE
##  1:            0.0000          0.0252            0.0383
##  2:            0.0098          0.0924            0.0538
##  3:                NA              NA                NA
##  4:                NA              NA                NA
##  5:                NA              NA                NA
##  6:                NA              NA                NA
##  7:                NA              NA                NA
##  8:                NA              NA                NA
##  9:                NA              NA                NA
## 10:                NA              NA                NA
##     YEARS_BEGINEXPLUATATION_MODE YEARS_BUILD_MODE COMMONAREA_MODE
##  1:                       0.9722           0.6341          0.0144
##  2:                       0.9851           0.8040          0.0497
##  3:                           NA               NA              NA
##  4:                           NA               NA              NA
##  5:                           NA               NA              NA
##  6:                           NA               NA              NA
##  7:                           NA               NA              NA
##  8:                           NA               NA              NA
##  9:                           NA               NA              NA
## 10:                           NA               NA              NA
##     ELEVATORS_MODE ENTRANCES_MODE FLOORSMAX_MODE FLOORSMIN_MODE
##  1:         0.0000         0.0690         0.0833         0.1250
##  2:         0.0806         0.0345         0.2917         0.3333
##  3:             NA             NA             NA             NA
##  4:             NA             NA             NA             NA
##  5:             NA             NA             NA             NA
##  6:             NA             NA             NA             NA
##  7:             NA             NA             NA             NA
##  8:             NA             NA             NA             NA
##  9:             NA             NA             NA             NA
## 10:             NA             NA             NA             NA
##     LANDAREA_MODE LIVINGAPARTMENTS_MODE LIVINGAREA_MODE
##  1:        0.0377                 0.022          0.0198
##  2:        0.0128                 0.079          0.0554
##  3:            NA                    NA              NA
##  4:            NA                    NA              NA
##  5:            NA                    NA              NA
##  6:            NA                    NA              NA
##  7:            NA                    NA              NA
##  8:            NA                    NA              NA
##  9:            NA                    NA              NA
## 10:            NA                    NA              NA
##     NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_MODE APARTMENTS_MEDI
##  1:                        0                  0          0.0250
##  2:                        0                  0          0.0968
##  3:                       NA                 NA              NA
##  4:                       NA                 NA              NA
##  5:                       NA                 NA              NA
##  6:                       NA                 NA              NA
##  7:                       NA                 NA              NA
##  8:                       NA                 NA              NA
##  9:                       NA                 NA              NA
## 10:                       NA                 NA              NA
##     BASEMENTAREA_MEDI YEARS_BEGINEXPLUATATION_MEDI YEARS_BUILD_MEDI
##  1:            0.0369                       0.9722           0.6243
##  2:            0.0529                       0.9851           0.7987
##  3:                NA                           NA               NA
##  4:                NA                           NA               NA
##  5:                NA                           NA               NA
##  6:                NA                           NA               NA
##  7:                NA                           NA               NA
##  8:                NA                           NA               NA
##  9:                NA                           NA               NA
## 10:                NA                           NA               NA
##     COMMONAREA_MEDI ELEVATORS_MEDI ENTRANCES_MEDI FLOORSMAX_MEDI
##  1:          0.0144           0.00         0.0690         0.0833
##  2:          0.0608           0.08         0.0345         0.2917
##  3:              NA             NA             NA             NA
##  4:              NA             NA             NA             NA
##  5:              NA             NA             NA             NA
##  6:              NA             NA             NA             NA
##  7:              NA             NA             NA             NA
##  8:              NA             NA             NA             NA
##  9:              NA             NA             NA             NA
## 10:              NA             NA             NA             NA
##     FLOORSMIN_MEDI LANDAREA_MEDI LIVINGAPARTMENTS_MEDI LIVINGAREA_MEDI
##  1:         0.1250        0.0375                0.0205          0.0193
##  2:         0.3333        0.0132                0.0787          0.0558
##  3:             NA            NA                    NA              NA
##  4:             NA            NA                    NA              NA
##  5:             NA            NA                    NA              NA
##  6:             NA            NA                    NA              NA
##  7:             NA            NA                    NA              NA
##  8:             NA            NA                    NA              NA
##  9:             NA            NA                    NA              NA
## 10:             NA            NA                    NA              NA
##     NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI FONDKAPREMONT_MODE
##  1:                   0.0000               0.00   reg oper account
##  2:                   0.0039               0.01   reg oper account
##  3:                       NA                 NA                   
##  4:                       NA                 NA                   
##  5:                       NA                 NA                   
##  6:                       NA                 NA                   
##  7:                       NA                 NA                   
##  8:                       NA                 NA                   
##  9:                       NA                 NA                   
## 10:                       NA                 NA                   
##     HOUSETYPE_MODE TOTALAREA_MODE WALLSMATERIAL_MODE EMERGENCYSTATE_MODE
##  1: block of flats         0.0149       Stone, brick                  No
##  2: block of flats         0.0714              Block                  No
##  3:                            NA                                       
##  4:                            NA                                       
##  5:                            NA                                       
##  6:                            NA                                       
##  7:                            NA                                       
##  8:                            NA                                       
##  9:                            NA                                       
## 10:                            NA                                       
##     OBS_30_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE
##  1:                        2                        2
##  2:                        1                        0
##  3:                        0                        0
##  4:                        2                        0
##  5:                        0                        0
##  6:                        0                        0
##  7:                        1                        0
##  8:                        2                        0
##  9:                        1                        0
## 10:                        2                        0
##     OBS_60_CNT_SOCIAL_CIRCLE DEF_60_CNT_SOCIAL_CIRCLE
##  1:                        2                        2
##  2:                        1                        0
##  3:                        0                        0
##  4:                        2                        0
##  5:                        0                        0
##  6:                        0                        0
##  7:                        1                        0
##  8:                        2                        0
##  9:                        1                        0
## 10:                        2                        0
##     DAYS_LAST_PHONE_CHANGE FLAG_DOCUMENT_2 FLAG_DOCUMENT_3 FLAG_DOCUMENT_4
##  1:                  -1134               0               1               0
##  2:                   -828               0               1               0
##  3:                   -815               0               0               0
##  4:                   -617               0               1               0
##  5:                  -1106               0               0               0
##  6:                  -2536               0               1               0
##  7:                  -1562               0               0               0
##  8:                  -1070               0               1               0
##  9:                      0               0               1               0
## 10:                  -1673               0               0               0
##     FLAG_DOCUMENT_5 FLAG_DOCUMENT_6 FLAG_DOCUMENT_7 FLAG_DOCUMENT_8
##  1:               0               0               0               0
##  2:               0               0               0               0
##  3:               0               0               0               0
##  4:               0               0               0               0
##  5:               0               0               0               1
##  6:               0               0               0               0
##  7:               0               0               0               1
##  8:               0               0               0               0
##  9:               0               0               0               0
## 10:               0               0               0               0
##     FLAG_DOCUMENT_9 FLAG_DOCUMENT_10 FLAG_DOCUMENT_11 FLAG_DOCUMENT_12
##  1:               0                0                0                0
##  2:               0                0                0                0
##  3:               0                0                0                0
##  4:               0                0                0                0
##  5:               0                0                0                0
##  6:               0                0                0                0
##  7:               0                0                0                0
##  8:               0                0                0                0
##  9:               0                0                0                0
## 10:               0                0                0                0
##     FLAG_DOCUMENT_13 FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 FLAG_DOCUMENT_16
##  1:                0                0                0                0
##  2:                0                0                0                0
##  3:                0                0                0                0
##  4:                0                0                0                0
##  5:                0                0                0                0
##  6:                0                0                0                0
##  7:                0                1                0                0
##  8:                0                0                0                0
##  9:                0                0                0                0
## 10:                0                0                0                0
##     FLAG_DOCUMENT_17 FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20
##  1:                0                0                0                0
##  2:                0                0                0                0
##  3:                0                0                0                0
##  4:                0                0                0                0
##  5:                0                0                0                0
##  6:                0                0                0                0
##  7:                0                0                0                0
##  8:                0                0                0                0
##  9:                0                0                0                0
## 10:                0                0                0                0
##     FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY
##  1:                0                          0                         0
##  2:                0                          0                         0
##  3:                0                          0                         0
##  4:                0                         NA                        NA
##  5:                0                          0                         0
##  6:                0                          0                         0
##  7:                0                          0                         0
##  8:                0                          0                         0
##  9:                0                          0                         0
## 10:                0                         NA                        NA
##     AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON
##  1:                          0                         0
##  2:                          0                         0
##  3:                          0                         0
##  4:                         NA                        NA
##  5:                          0                         0
##  6:                          0                         0
##  7:                          0                         1
##  8:                          0                         0
##  9:                          0                         0
## 10:                         NA                        NA
##     AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
##  1:                         0                          1
##  2:                         0                          0
##  3:                         0                          0
##  4:                        NA                         NA
##  5:                         0                          0
##  6:                         1                          1
##  7:                         1                          2
##  8:                         0                          0
##  9:                         0                          1
## 10:                        NA                         NA
# Summarise every column of Train; the result is a list of summary tables,
# one per column type (here: character, integer and numeric), reporting
# missing/complete counts plus type-specific statistics.
skim_to_list(Train)
## $character
## # A tibble: 16 x 8
##    variable              missing complete n      min   max   empty n_unique
##  * <chr>                 <chr>   <chr>    <chr>  <chr> <chr> <chr> <chr>   
##  1 CODE_GENDER           0       307511   307511 1     3     0     3       
##  2 EMERGENCYSTATE_MODE   0       307511   307511 0     3     1457~ 3       
##  3 FLAG_OWN_CAR          0       307511   307511 1     1     0     2       
##  4 FLAG_OWN_REALTY       0       307511   307511 1     1     0     2       
##  5 FONDKAPREMONT_MODE    0       307511   307511 0     21    2102~ 5       
##  6 HOUSETYPE_MODE        0       307511   307511 0     16    1542~ 4       
##  7 NAME_CONTRACT_TYPE    0       307511   307511 10    15    0     2       
##  8 NAME_EDUCATION_TYPE   0       307511   307511 15    29    0     5       
##  9 NAME_FAMILY_STATUS    0       307511   307511 5     20    0     6       
## 10 NAME_HOUSING_TYPE     0       307511   307511 12    19    0     6       
## 11 NAME_INCOME_TYPE      0       307511   307511 7     20    0     8       
## 12 NAME_TYPE_SUITE       0       307511   307511 0     15    1292  8       
## 13 OCCUPATION_TYPE       0       307511   307511 0     21    96391 19      
## 14 ORGANIZATION_TYPE     0       307511   307511 3     22    0     58      
## 15 WALLSMATERIAL_MODE    0       307511   307511 0     12    1563~ 8       
## 16 WEEKDAY_APPR_PROCESS~ 0       307511   307511 6     9     0     7       
## 
## $integer
## # A tibble: 41 x 12
##    variable missing complete n     mean  sd    p0    p25   p50   p75  
##  * <chr>    <chr>   <chr>    <chr> <chr> <chr> <chr> <chr> <chr> <chr>
##  1 CNT_CHI~ 0       307511   3075~ "   ~ "   ~ 0     "   ~ 0     "   ~
##  2 DAYS_BI~ 0       307511   3075~ "-16~ "  4~ -252~ "-19~ -157~ "-12~
##  3 DAYS_EM~ 0       307511   3075~ " 63~ "141~ -179~ " -2~ -1213 "  -~
##  4 DAYS_ID~ 0       307511   3075~ " -2~ "  1~ -7197 " -4~ -3254 " -1~
##  5 FLAG_CO~ 0       307511   3075~ "   ~ "   ~ 0     "   ~ 1     "   ~
##  6 FLAG_DO~ 0       307511   3075~ "   ~ "   ~ 0     "   ~ 0     "   ~
##  7 FLAG_DO~ 0       307511   3075~ "   ~ "   ~ 0     "   ~ 0     "   ~
##  8 FLAG_DO~ 0       307511   3075~ "   ~ "   ~ 0     "   ~ 0     "   ~
##  9 FLAG_DO~ 0       307511   3075~ "   ~ "   ~ 0     "   ~ 0     "   ~
## 10 FLAG_DO~ 0       307511   3075~ "   ~ "   ~ 0     "   ~ 0     "   ~
## # ... with 31 more rows, and 2 more variables: p100 <chr>, hist <chr>
## 
## $numeric
## # A tibble: 65 x 12
##    variable missing complete n     mean  sd    p0    p25   p50   p75  
##  * <chr>    <chr>   <chr>    <chr> <chr> <chr> <chr> <chr> <chr> <chr>
##  1 AMT_ANN~ 12      307499   3075~ " 27~ " 14~ "  1~ " 16~ " 24~ " 34~
##  2 AMT_CRE~ 0       307511   3075~ " 6e~ " 4e~ " 45~ "270~ "513~ "808~
##  3 AMT_GOO~ 278     307233   3075~ "538~ "369~ " 40~ "238~ "450~ "679~
##  4 AMT_INC~ 0       307511   3075~ "168~ "237~ " 25~ "112~ "147~ " 2e~
##  5 AMT_REQ~ 41519   265992   3075~ "   ~ "   ~ "   ~ "   ~ "   ~ "   ~
##  6 AMT_REQ~ 41519   265992   3075~ "   ~ "   ~ "   ~ "   ~ "   ~ "   ~
##  7 AMT_REQ~ 41519   265992   3075~ "   ~ "   ~ "   ~ "   ~ "   ~ "   ~
##  8 AMT_REQ~ 41519   265992   3075~ "   ~ "   ~ "   ~ "   ~ "   ~ "   ~
##  9 AMT_REQ~ 41519   265992   3075~ "   ~ "   ~ "   ~ "   ~ "   ~ "   ~
## 10 AMT_REQ~ 41519   265992   3075~ "   ~ "   ~ "   ~ "   ~ "   ~ "   ~
## # ... with 55 more rows, and 2 more variables: p100 <chr>, hist <chr>
# Check for duplicate rows. distinct() returns the table with duplicate rows
# dropped; the result is only printed here, not assigned back, so Train itself
# is left unchanged. Fortunately the result still has 307,511 rows (the same
# count glimpse() reports for Train), i.e. this dataset has no duplicate rows.
Train %>% distinct()
##         SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR
##      1:     100002      1         Cash loans           M            N
##      2:     100003      0         Cash loans           F            N
##      3:     100004      0    Revolving loans           M            Y
##      4:     100006      0         Cash loans           F            N
##      5:     100007      0         Cash loans           M            N
##     ---                                                              
## 307507:     456251      0         Cash loans           M            N
## 307508:     456252      0         Cash loans           F            N
## 307509:     456253      0         Cash loans           F            N
## 307510:     456254      1         Cash loans           F            N
## 307511:     456255      0         Cash loans           F            N
##         FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT
##      1:               Y            0           202500   406597.5
##      2:               N            0           270000  1293502.5
##      3:               Y            0            67500   135000.0
##      4:               Y            0           135000   312682.5
##      5:               Y            0           121500   513000.0
##     ---                                                         
## 307507:               N            0           157500   254700.0
## 307508:               Y            0            72000   269550.0
## 307509:               Y            0           153000   677664.0
## 307510:               Y            0           171000   370107.0
## 307511:               N            0           157500   675000.0
##         AMT_ANNUITY AMT_GOODS_PRICE NAME_TYPE_SUITE     NAME_INCOME_TYPE
##      1:     24700.5          351000   Unaccompanied              Working
##      2:     35698.5         1129500          Family        State servant
##      3:      6750.0          135000   Unaccompanied              Working
##      4:     29686.5          297000   Unaccompanied              Working
##      5:     21865.5          513000   Unaccompanied              Working
##     ---                                                                 
## 307507:     27558.0          225000   Unaccompanied              Working
## 307508:     12001.5          225000   Unaccompanied            Pensioner
## 307509:     29979.0          585000   Unaccompanied              Working
## 307510:     20205.0          319500   Unaccompanied Commercial associate
## 307511:     49117.5          675000   Unaccompanied Commercial associate
##                   NAME_EDUCATION_TYPE   NAME_FAMILY_STATUS
##      1: Secondary / secondary special Single / not married
##      2:              Higher education              Married
##      3: Secondary / secondary special Single / not married
##      4: Secondary / secondary special       Civil marriage
##      5: Secondary / secondary special Single / not married
##     ---                                                   
## 307507: Secondary / secondary special            Separated
## 307508: Secondary / secondary special                Widow
## 307509:              Higher education            Separated
## 307510: Secondary / secondary special              Married
## 307511:              Higher education              Married
##         NAME_HOUSING_TYPE REGION_POPULATION_RELATIVE DAYS_BIRTH
##      1: House / apartment                   0.018801      -9461
##      2: House / apartment                   0.003541     -16765
##      3: House / apartment                   0.010032     -19046
##      4: House / apartment                   0.008019     -19005
##      5: House / apartment                   0.028663     -19932
##     ---                                                        
## 307507:      With parents                   0.032561      -9327
## 307508: House / apartment                   0.025164     -20775
## 307509: House / apartment                   0.005002     -14966
## 307510: House / apartment                   0.005313     -11961
## 307511: House / apartment                   0.046220     -16856
##         DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE
##      1:          -637             -3648           -2120          NA
##      2:         -1188             -1186            -291          NA
##      3:          -225             -4260           -2531          26
##      4:         -3039             -9833           -2437          NA
##      5:         -3038             -4311           -3458          NA
##     ---                                                            
## 307507:          -236             -8456           -1982          NA
## 307508:        365243             -4388           -4090          NA
## 307509:         -7921             -6737           -5150          NA
## 307510:         -4786             -2562            -931          NA
## 307511:         -1262             -5128            -410          NA
##         FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE
##      1:          1              1               0                1
##      2:          1              1               0                1
##      3:          1              1               1                1
##      4:          1              1               0                1
##      5:          1              1               0                1
##     ---                                                           
## 307507:          1              1               0                1
## 307508:          1              0               0                1
## 307509:          1              1               0                1
## 307510:          1              1               0                1
## 307511:          1              1               1                1
##         FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS
##      1:          1          0        Laborers               1
##      2:          1          0      Core staff               2
##      3:          1          0        Laborers               1
##      4:          0          0        Laborers               2
##      5:          0          0      Core staff               1
##     ---                                                      
## 307507:          0          0     Sales staff               1
## 307508:          1          0                               1
## 307509:          0          1        Managers               1
## 307510:          0          0        Laborers               2
## 307511:          1          0        Laborers               2
##         REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY
##      1:                    2                           2
##      2:                    1                           1
##      3:                    2                           2
##      4:                    2                           2
##      5:                    2                           2
##     ---                                                 
## 307507:                    1                           1
## 307508:                    2                           2
## 307509:                    3                           3
## 307510:                    2                           2
## 307511:                    1                           1
##         WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START
##      1:                  WEDNESDAY                      10
##      2:                     MONDAY                      11
##      3:                     MONDAY                       9
##      4:                  WEDNESDAY                      17
##      5:                   THURSDAY                      11
##     ---                                                   
## 307507:                   THURSDAY                      15
## 307508:                     MONDAY                       8
## 307509:                   THURSDAY                       9
## 307510:                  WEDNESDAY                       9
## 307511:                   THURSDAY                      20
##         REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION
##      1:                          0                          0
##      2:                          0                          0
##      3:                          0                          0
##      4:                          0                          0
##      5:                          0                          0
##     ---                                                      
## 307507:                          0                          0
## 307508:                          0                          0
## 307509:                          0                          0
## 307510:                          0                          0
## 307511:                          0                          0
##         LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY
##      1:                           0                      0
##      2:                           0                      0
##      3:                           0                      0
##      4:                           0                      0
##      5:                           0                      0
##     ---                                                   
## 307507:                           0                      0
## 307508:                           0                      0
## 307509:                           0                      0
## 307510:                           0                      1
## 307511:                           0                      0
##         REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY
##      1:                      0                       0
##      2:                      0                       0
##      3:                      0                       0
##      4:                      0                       0
##      5:                      1                       1
##     ---                                               
## 307507:                      0                       0
## 307508:                      0                       0
## 307509:                      1                       1
## 307510:                      1                       0
## 307511:                      1                       1
##              ORGANIZATION_TYPE EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3
##      1: Business Entity Type 3   0.08303697    0.2629486    0.1393758
##      2:                 School   0.31126731    0.6222458           NA
##      3:             Government           NA    0.5559121    0.7295667
##      4: Business Entity Type 3           NA    0.6504417           NA
##      5:               Religion           NA    0.3227383           NA
##     ---                                                              
## 307507:               Services   0.14557045    0.6816324           NA
## 307508:                    XNA           NA    0.1159921           NA
## 307509:                 School   0.74402640    0.5357218    0.2188591
## 307510: Business Entity Type 1           NA    0.5141628    0.6610235
## 307511: Business Entity Type 3   0.73445967    0.7085689    0.1139224
##         APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG
##      1:         0.0247           0.0369                      0.9722
##      2:         0.0959           0.0529                      0.9851
##      3:             NA               NA                          NA
##      4:             NA               NA                          NA
##      5:             NA               NA                          NA
##     ---                                                            
## 307507:         0.2021           0.0887                      0.9876
## 307508:         0.0247           0.0435                      0.9727
## 307509:         0.1031           0.0862                      0.9816
## 307510:         0.0124               NA                      0.9771
## 307511:         0.0742           0.0526                      0.9881
##         YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG
##      1:          0.6192         0.0143          0.00        0.0690
##      2:          0.7960         0.0605          0.08        0.0345
##      3:              NA             NA            NA            NA
##      4:              NA             NA            NA            NA
##      5:              NA             NA            NA            NA
##     ---                                                           
## 307507:          0.8300         0.0202          0.22        0.1034
## 307508:          0.6260         0.0022          0.00        0.1034
## 307509:          0.7484         0.0123          0.00        0.2069
## 307510:              NA             NA            NA        0.0690
## 307511:              NA         0.0176          0.08        0.0690
##         FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG
##      1:        0.0833        0.1250       0.0369               0.0202
##      2:        0.2917        0.3333       0.0130               0.0773
##      3:            NA            NA           NA                   NA
##      4:            NA            NA           NA                   NA
##      5:            NA            NA           NA                   NA
##     ---                                                              
## 307507:        0.6042        0.2708       0.0594               0.1484
## 307508:        0.0833        0.1250       0.0579               0.0202
## 307509:        0.1667        0.2083           NA               0.0841
## 307510:        0.0417            NA           NA                   NA
## 307511:        0.3750            NA           NA                   NA
##         LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG
##      1:         0.0190                  0.0000            0.0000
##      2:         0.0549                  0.0039            0.0098
##      3:             NA                      NA                NA
##      4:             NA                      NA                NA
##      5:             NA                      NA                NA
##     ---                                                         
## 307507:         0.1965                  0.0753            0.1095
## 307508:         0.0257                  0.0000            0.0000
## 307509:         0.9279                  0.0000            0.0000
## 307510:         0.0061                      NA                NA
## 307511:         0.0791                      NA            0.0000
##         APARTMENTS_MODE BASEMENTAREA_MODE YEARS_BEGINEXPLUATATION_MODE
##      1:          0.0252            0.0383                       0.9722
##      2:          0.0924            0.0538                       0.9851
##      3:              NA                NA                           NA
##      4:              NA                NA                           NA
##      5:              NA                NA                           NA
##     ---                                                               
## 307507:          0.1008            0.0172                       0.9782
## 307508:          0.0252            0.0451                       0.9727
## 307509:          0.1050            0.0894                       0.9816
## 307510:          0.0126                NA                       0.9772
## 307511:          0.0756            0.0546                       0.9881
##         YEARS_BUILD_MODE COMMONAREA_MODE ELEVATORS_MODE ENTRANCES_MODE
##      1:           0.6341          0.0144         0.0000         0.0690
##      2:           0.8040          0.0497         0.0806         0.0345
##      3:               NA              NA             NA             NA
##      4:               NA              NA             NA             NA
##      5:               NA              NA             NA             NA
##     ---                                                               
## 307507:           0.7125          0.0172         0.0806         0.0345
## 307508:           0.6406          0.0022         0.0000         0.1034
## 307509:           0.7583          0.0124         0.0000         0.2069
## 307510:               NA              NA             NA         0.0690
## 307511:               NA          0.0178         0.0806         0.0690
##         FLOORSMAX_MODE FLOORSMIN_MODE LANDAREA_MODE LIVINGAPARTMENTS_MODE
##      1:         0.0833         0.1250        0.0377                0.0220
##      2:         0.2917         0.3333        0.0128                0.0790
##      3:             NA             NA            NA                    NA
##      4:             NA             NA            NA                    NA
##      5:             NA             NA            NA                    NA
##     ---                                                                  
## 307507:         0.4583         0.0417        0.0094                0.0882
## 307508:         0.0833         0.1250        0.0592                0.0220
## 307509:         0.1667         0.2083            NA                0.0918
## 307510:         0.0417             NA            NA                    NA
## 307511:         0.3750             NA            NA                    NA
##         LIVINGAREA_MODE NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_MODE
##      1:          0.0198                        0             0.0000
##      2:          0.0554                        0             0.0000
##      3:              NA                       NA                 NA
##      4:              NA                       NA                 NA
##      5:              NA                       NA                 NA
##     ---                                                            
## 307507:          0.0853                        0             0.0125
## 307508:          0.0267                        0             0.0000
## 307509:          0.9667                        0             0.0000
## 307510:          0.0063                       NA                 NA
## 307511:          0.0824                       NA             0.0000
##         APARTMENTS_MEDI BASEMENTAREA_MEDI YEARS_BEGINEXPLUATATION_MEDI
##      1:          0.0250            0.0369                       0.9722
##      2:          0.0968            0.0529                       0.9851
##      3:              NA                NA                           NA
##      4:              NA                NA                           NA
##      5:              NA                NA                           NA
##     ---                                                               
## 307507:          0.2040            0.0887                       0.9876
## 307508:          0.0250            0.0435                       0.9727
## 307509:          0.1041            0.0862                       0.9816
## 307510:          0.0125                NA                       0.9771
## 307511:          0.0749            0.0526                       0.9881
##         YEARS_BUILD_MEDI COMMONAREA_MEDI ELEVATORS_MEDI ENTRANCES_MEDI
##      1:           0.6243          0.0144           0.00         0.0690
##      2:           0.7987          0.0608           0.08         0.0345
##      3:               NA              NA             NA             NA
##      4:               NA              NA             NA             NA
##      5:               NA              NA             NA             NA
##     ---                                                               
## 307507:           0.8323          0.0203           0.22         0.1034
## 307508:           0.6310          0.0022           0.00         0.1034
## 307509:           0.7518          0.0124           0.00         0.2069
## 307510:               NA              NA             NA         0.0690
## 307511:               NA          0.0177           0.08         0.0690
##         FLOORSMAX_MEDI FLOORSMIN_MEDI LANDAREA_MEDI LIVINGAPARTMENTS_MEDI
##      1:         0.0833         0.1250        0.0375                0.0205
##      2:         0.2917         0.3333        0.0132                0.0787
##      3:             NA             NA            NA                    NA
##      4:             NA             NA            NA                    NA
##      5:             NA             NA            NA                    NA
##     ---                                                                  
## 307507:         0.6042         0.2708        0.0605                0.1509
## 307508:         0.0833         0.1250        0.0589                0.0205
## 307509:         0.1667         0.2083            NA                0.0855
## 307510:         0.0417             NA            NA                    NA
## 307511:         0.3750             NA            NA                    NA
##         LIVINGAREA_MEDI NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI
##      1:          0.0193                   0.0000             0.0000
##      2:          0.0558                   0.0039             0.0100
##      3:              NA                       NA                 NA
##      4:              NA                       NA                 NA
##      5:              NA                       NA                 NA
##     ---                                                            
## 307507:          0.2001                   0.0757             0.1118
## 307508:          0.0261                   0.0000             0.0000
## 307509:          0.9445                   0.0000             0.0000
## 307510:          0.0062                       NA                 NA
## 307511:          0.0805                       NA             0.0000
##         FONDKAPREMONT_MODE HOUSETYPE_MODE TOTALAREA_MODE
##      1:   reg oper account block of flats         0.0149
##      2:   reg oper account block of flats         0.0714
##      3:                                               NA
##      4:                                               NA
##      5:                                               NA
##     ---                                                 
## 307507:   reg oper account block of flats         0.2898
## 307508:   reg oper account block of flats         0.0214
## 307509:   reg oper account block of flats         0.7970
## 307510:                    block of flats         0.0086
## 307511:                    block of flats         0.0718
##         WALLSMATERIAL_MODE EMERGENCYSTATE_MODE OBS_30_CNT_SOCIAL_CIRCLE
##      1:       Stone, brick                  No                        2
##      2:              Block                  No                        1
##      3:                                                               0
##      4:                                                               2
##      5:                                                               0
##     ---                                                                
## 307507:       Stone, brick                  No                        0
## 307508:       Stone, brick                  No                        0
## 307509:              Panel                  No                        6
## 307510:       Stone, brick                  No                        0
## 307511:              Panel                  No                        0
##         DEF_30_CNT_SOCIAL_CIRCLE OBS_60_CNT_SOCIAL_CIRCLE
##      1:                        2                        2
##      2:                        0                        1
##      3:                        0                        0
##      4:                        0                        2
##      5:                        0                        0
##     ---                                                  
## 307507:                        0                        0
## 307508:                        0                        0
## 307509:                        0                        6
## 307510:                        0                        0
## 307511:                        0                        0
##         DEF_60_CNT_SOCIAL_CIRCLE DAYS_LAST_PHONE_CHANGE FLAG_DOCUMENT_2
##      1:                        2                  -1134               0
##      2:                        0                   -828               0
##      3:                        0                   -815               0
##      4:                        0                   -617               0
##      5:                        0                  -1106               0
##     ---                                                                
## 307507:                        0                   -273               0
## 307508:                        0                      0               0
## 307509:                        0                  -1909               0
## 307510:                        0                   -322               0
## 307511:                        0                   -787               0
##         FLAG_DOCUMENT_3 FLAG_DOCUMENT_4 FLAG_DOCUMENT_5 FLAG_DOCUMENT_6
##      1:               1               0               0               0
##      2:               1               0               0               0
##      3:               0               0               0               0
##      4:               1               0               0               0
##      5:               0               0               0               0
##     ---                                                                
## 307507:               0               0               0               0
## 307508:               1               0               0               0
## 307509:               1               0               0               0
## 307510:               1               0               0               0
## 307511:               1               0               0               0
##         FLAG_DOCUMENT_7 FLAG_DOCUMENT_8 FLAG_DOCUMENT_9 FLAG_DOCUMENT_10
##      1:               0               0               0                0
##      2:               0               0               0                0
##      3:               0               0               0                0
##      4:               0               0               0                0
##      5:               0               1               0                0
##     ---                                                                 
## 307507:               0               1               0                0
## 307508:               0               0               0                0
## 307509:               0               0               0                0
## 307510:               0               0               0                0
## 307511:               0               0               0                0
##         FLAG_DOCUMENT_11 FLAG_DOCUMENT_12 FLAG_DOCUMENT_13
##      1:                0                0                0
##      2:                0                0                0
##      3:                0                0                0
##      4:                0                0                0
##      5:                0                0                0
##     ---                                                   
## 307507:                0                0                0
## 307508:                0                0                0
## 307509:                0                0                0
## 307510:                0                0                0
## 307511:                0                0                0
##         FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 FLAG_DOCUMENT_16
##      1:                0                0                0
##      2:                0                0                0
##      3:                0                0                0
##      4:                0                0                0
##      5:                0                0                0
##     ---                                                   
## 307507:                0                0                0
## 307508:                0                0                0
## 307509:                0                0                0
## 307510:                0                0                0
## 307511:                0                0                0
##         FLAG_DOCUMENT_17 FLAG_DOCUMENT_18 FLAG_DOCUMENT_19
##      1:                0                0                0
##      2:                0                0                0
##      3:                0                0                0
##      4:                0                0                0
##      5:                0                0                0
##     ---                                                   
## 307507:                0                0                0
## 307508:                0                0                0
## 307509:                0                0                0
## 307510:                0                0                0
## 307511:                0                0                0
##         FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR
##      1:                0                0                          0
##      2:                0                0                          0
##      3:                0                0                          0
##      4:                0                0                         NA
##      5:                0                0                          0
##     ---                                                             
## 307507:                0                0                         NA
## 307508:                0                0                         NA
## 307509:                0                0                          1
## 307510:                0                0                          0
## 307511:                0                0                          0
##         AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK
##      1:                         0                          0
##      2:                         0                          0
##      3:                         0                          0
##      4:                        NA                         NA
##      5:                         0                          0
##     ---                                                     
## 307507:                        NA                         NA
## 307508:                        NA                         NA
## 307509:                         0                          0
## 307510:                         0                          0
## 307511:                         0                          0
##         AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT
##      1:                         0                         0
##      2:                         0                         0
##      3:                         0                         0
##      4:                        NA                        NA
##      5:                         0                         0
##     ---                                                    
## 307507:                        NA                        NA
## 307508:                        NA                        NA
## 307509:                         1                         0
## 307510:                         0                         0
## 307511:                         2                         0
##         AMT_REQ_CREDIT_BUREAU_YEAR
##      1:                          1
##      2:                          0
##      3:                          0
##      4:                         NA
##      5:                          0
##     ---                           
## 307507:                         NA
## 307508:                         NA
## 307509:                          1
## 307510:                          0
## 307511:                          1
# Tackling cardinality: collapse the numbered ORGANIZATION_TYPE sub-types
# ("Industry: type 1" ... "Industry: type 13", etc.) into their parent
# category so the feature has far fewer distinct levels.
# `:=` rewrites only the matching rows in place; the previous
# `Train[rows]$col <- value` form goes through `[<-`, which data.table
# documents as copying the table on every statement.
Train[
  ORGANIZATION_TYPE %in% paste0("Business Entity Type ", 1:3),
  ORGANIZATION_TYPE := "Business Entity"
]

Train[
  ORGANIZATION_TYPE %in% paste0("Industry: type ", 1:13),
  ORGANIZATION_TYPE := "Industry"
]

Train[
  ORGANIZATION_TYPE %in% paste0("Trade: type ", 1:7),
  ORGANIZATION_TYPE := "Trade"
]

Train[
  ORGANIZATION_TYPE %in% paste0("Transport: type ", 1:4),
  ORGANIZATION_TYPE := "Transport"
]

# Convert NAME_EDUCATION_TYPE from its text label to an ordinal numeric score:
# the position of the label in the vector below (1 = "Lower secondary", ...,
# 5 = "Academic degree"). A single match() lookup replaces five separate
# sub-assignments followed by a character -> numeric round-trip.
# Labels that are not listed become NA (the old code produced NA for them too,
# but via an as.numeric() coercion warning).
Train[, NAME_EDUCATION_TYPE := as.numeric(match(
  NAME_EDUCATION_TYPE,
  c(
    "Lower secondary",
    "Secondary / secondary special",
    "Incomplete higher",
    "Higher education",
    "Academic degree"
  )
))]

Calculating and checking for missing values within Train data

Is it OK to use variables such as age and gender in a feature-engineered form?

# Count the missing values in every column of Train, most incomplete first.
# vapply() pins the result to exactly one integer per column (sapply() does
# not guarantee its return type) and TRUE replaces the reassignable T.
na_counts <- sort(
  vapply(Train, function(column) sum(is.na(column)), integer(1)),
  decreasing = TRUE
)

# One row per variable: its name, its NA count and the NA share of all rows
# expressed in percent.
missing_train <- data.frame(
  Variables = names(na_counts),
  Num_Missing_values = unname(na_counts),
  Percentage = (unname(na_counts) / nrow(Train)) * 100,
  stringsAsFactors = FALSE
)

# Keep only the variables with more than 20% of their values missing.
missing_train <- missing_train %>%
  filter(Percentage > 20)

# Horizontal bar chart of the (at most 30) most incomplete variables, ordered
# by their missing percentage.
ggplot(
  head(missing_train, 30),
  aes(reorder(Variables, Percentage), Percentage, fill = Variables)
) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  coord_flip() +
  theme(legend.position = "none")

# Build the modelling table by dropping 48 columns from Train.
# NOTE(review): most of these look like the building/housing statistics flagged
# by the missing-value chart; NAME_TYPE_SUITE, NAME_FAMILY_STATUS and
# CODE_GENDER are dropped as well, for a reason not visible here — confirm.
train_less_missing <- subset(
  Train,
  select = -c(
    # building / housing statistics (_AVG, _MODE and _MEDI variants)
    APARTMENTS_AVG, APARTMENTS_MODE, APARTMENTS_MEDI,
    BASEMENTAREA_AVG, BASEMENTAREA_MODE, BASEMENTAREA_MEDI,
    YEARS_BEGINEXPLUATATION_AVG, YEARS_BEGINEXPLUATATION_MODE,
    YEARS_BEGINEXPLUATATION_MEDI,
    YEARS_BUILD_AVG, YEARS_BUILD_MODE, YEARS_BUILD_MEDI,
    COMMONAREA_AVG, COMMONAREA_MODE, COMMONAREA_MEDI,
    ELEVATORS_AVG, ELEVATORS_MODE, ELEVATORS_MEDI,
    ENTRANCES_AVG, ENTRANCES_MODE, ENTRANCES_MEDI,
    FLOORSMAX_AVG, FLOORSMAX_MODE, FLOORSMAX_MEDI,
    FLOORSMIN_AVG, FLOORSMIN_MODE, FLOORSMIN_MEDI,
    LANDAREA_AVG, LANDAREA_MODE, LANDAREA_MEDI,
    LIVINGAPARTMENTS_AVG, LIVINGAPARTMENTS_MODE, LIVINGAPARTMENTS_MEDI,
    LIVINGAREA_AVG, LIVINGAREA_MODE, LIVINGAREA_MEDI,
    NONLIVINGAPARTMENTS_AVG, NONLIVINGAPARTMENTS_MODE,
    NONLIVINGAPARTMENTS_MEDI,
    NONLIVINGAREA_AVG, NONLIVINGAREA_MODE, NONLIVINGAREA_MEDI,
    TOTALAREA_MODE,
    # other columns
    OWN_CAR_AGE, EXT_SOURCE_1,
    NAME_TYPE_SUITE, NAME_FAMILY_STATUS, CODE_GENDER
  )
)
# Feature creation
# Every new column is appended to train_less_missing with data.table's `:=`,
# in the same order as before (the order determines the column layout).
# NOTE(review): several denominators are day offsets or amounts that may be
# zero, which would yield Inf/NaN ratios — confirm against the data.

# Ratios and differences of the raw amount and day-offset columns.
train_less_missing[, `:=`(
  employed_ratio_birth = DAYS_EMPLOYED / DAYS_BIRTH,
  income_ratio_credit = AMT_INCOME_TOTAL / AMT_CREDIT,
  income_ratio_famsize = AMT_INCOME_TOTAL / CNT_FAM_MEMBERS,
  income_ration_annuity = AMT_INCOME_TOTAL / AMT_ANNUITY,
  credit_ration_annuity = AMT_CREDIT / AMT_ANNUITY,
  credit_ration_goods = AMT_CREDIT / AMT_GOODS_PRICE,
  credit_minus_goods = AMT_CREDIT - AMT_GOODS_PRICE,
  reg_ration_employed = DAYS_REGISTRATION / DAYS_EMPLOYED
)]

# Needs credit_ration_annuity from the call above, so it is added separately.
train_less_missing[, credit_ratio_annuity_ratio_employed :=
  credit_ration_annuity / DAYS_EMPLOYED]

train_less_missing[, `:=`(
  reg_ratio_idpublish = DAYS_REGISTRATION / DAYS_ID_PUBLISH,
  reg_ratio_birth = DAYS_REGISTRATION / DAYS_BIRTH,
  id_ratio_birth = DAYS_ID_PUBLISH / DAYS_BIRTH,
  phone_ratio_birth = DAYS_LAST_PHONE_CHANGE / DAYS_BIRTH,
  phone_ratio_employed = DAYS_LAST_PHONE_CHANGE / DAYS_EMPLOYED
)]

# Row-wise sum of the twenty FLAG_DOCUMENT_2 ... FLAG_DOCUMENT_21 flags.
train_less_missing[, document_sum := Reduce(`+`, .SD),
  .SDcols = paste0("FLAG_DOCUMENT_", 2:21)]

# Row-wise sum of the six contact flags.
train_less_missing[, sum_contact := Reduce(`+`, .SD),
  .SDcols = c(
    "FLAG_MOBIL", "FLAG_EMP_PHONE", "FLAG_WORK_PHONE",
    "FLAG_CONT_MOBILE", "FLAG_PHONE", "FLAG_EMAIL"
  )]

# Row-wise sum of the six address-mismatch flags.
train_less_missing[, reliability_city_in_city := Reduce(`+`, .SD),
  .SDcols = c(
    "REG_CITY_NOT_LIVE_CITY", "REG_CITY_NOT_WORK_CITY",
    "REG_REGION_NOT_LIVE_REGION", "REG_REGION_NOT_WORK_REGION",
    "LIVE_CITY_NOT_WORK_CITY", "LIVE_REGION_NOT_WORK_REGION"
  )]

# Sum of the hour, day, week and month credit-bureau request counts; NA in
# any of the four makes the total NA.
train_less_missing[, inquiries_total_month := Reduce(`+`, .SD),
  .SDcols = c(
    "AMT_REQ_CREDIT_BUREAU_HOUR", "AMT_REQ_CREDIT_BUREAU_DAY",
    "AMT_REQ_CREDIT_BUREAU_WEEK", "AMT_REQ_CREDIT_BUREAU_MON"
  )]

# 1 when the credit amount equals the goods price, 0 otherwise
# (if it is equal it means no insurance is taken).
train_less_missing[, credit_as_goods :=
  as.numeric(AMT_CREDIT == AMT_GOODS_PRICE)]
# Names of all character columns in train_less_missing, in column order.
string_2_factor_names <- names(train_less_missing)[
  vapply(train_less_missing, is.character, logical(1))
]

string_2_factor_names
##  [1] "NAME_CONTRACT_TYPE"         "FLAG_OWN_CAR"              
##  [3] "FLAG_OWN_REALTY"            "NAME_INCOME_TYPE"          
##  [5] "NAME_HOUSING_TYPE"          "OCCUPATION_TYPE"           
##  [7] "WEEKDAY_APPR_PROCESS_START" "ORGANIZATION_TYPE"         
##  [9] "FONDKAPREMONT_MODE"         "HOUSETYPE_MODE"            
## [11] "WALLSMATERIAL_MODE"         "EMERGENCYSTATE_MODE"
# For every numeric column, count its distinct values (NA counts as a value),
# then reshape to a long key/value table sorted by ascending count.
unique_numeric_values_tbl <- select_if(train_less_missing, is.numeric)
unique_numeric_values_tbl <- map_df(
  unique_numeric_values_tbl,
  function(column) length(unique(column))
)
unique_numeric_values_tbl <- gather(unique_numeric_values_tbl)
unique_numeric_values_tbl <- arrange(unique_numeric_values_tbl, value)
# Factor levels follow the sorted row order.
unique_numeric_values_tbl <- mutate(
  unique_numeric_values_tbl,
  key = as_factor(key)
)

unique_numeric_values_tbl
## # A tibble: 81 x 2
##    key                         value
##    <fct>                       <int>
##  1 TARGET                          2
##  2 FLAG_MOBIL                      2
##  3 FLAG_EMP_PHONE                  2
##  4 FLAG_WORK_PHONE                 2
##  5 FLAG_CONT_MOBILE                2
##  6 FLAG_PHONE                      2
##  7 FLAG_EMAIL                      2
##  8 REG_REGION_NOT_LIVE_REGION      2
##  9 REG_REGION_NOT_WORK_REGION      2
## 10 LIVE_REGION_NOT_WORK_REGION     2
## # ... with 71 more rows
# Numeric columns with fewer than `factor_limit` distinct values are treated
# as categorical.
factor_limit <- 7

# Their names, ordered from most to fewest distinct values.
num_2_factor_names <- filter(unique_numeric_values_tbl, value < factor_limit)
num_2_factor_names <- arrange(num_2_factor_names, desc(value))
num_2_factor_names <- as.character(pull(num_2_factor_names, key))

num_2_factor_names
##  [1] "AMT_REQ_CREDIT_BUREAU_HOUR"  "sum_contact"                
##  [3] "NAME_EDUCATION_TYPE"         "document_sum"               
##  [5] "REGION_RATING_CLIENT"        "REGION_RATING_CLIENT_W_CITY"
##  [7] "credit_as_goods"             "TARGET"                     
##  [9] "FLAG_MOBIL"                  "FLAG_EMP_PHONE"             
## [11] "FLAG_WORK_PHONE"             "FLAG_CONT_MOBILE"           
## [13] "FLAG_PHONE"                  "FLAG_EMAIL"                 
## [15] "REG_REGION_NOT_LIVE_REGION"  "REG_REGION_NOT_WORK_REGION" 
## [17] "LIVE_REGION_NOT_WORK_REGION" "REG_CITY_NOT_LIVE_CITY"     
## [19] "REG_CITY_NOT_WORK_CITY"      "LIVE_CITY_NOT_WORK_CITY"    
## [21] "FLAG_DOCUMENT_2"             "FLAG_DOCUMENT_3"            
## [23] "FLAG_DOCUMENT_4"             "FLAG_DOCUMENT_5"            
## [25] "FLAG_DOCUMENT_6"             "FLAG_DOCUMENT_7"            
## [27] "FLAG_DOCUMENT_8"             "FLAG_DOCUMENT_9"            
## [29] "FLAG_DOCUMENT_10"            "FLAG_DOCUMENT_11"           
## [31] "FLAG_DOCUMENT_12"            "FLAG_DOCUMENT_13"           
## [33] "FLAG_DOCUMENT_14"            "FLAG_DOCUMENT_15"           
## [35] "FLAG_DOCUMENT_16"            "FLAG_DOCUMENT_17"           
## [37] "FLAG_DOCUMENT_18"            "FLAG_DOCUMENT_19"           
## [39] "FLAG_DOCUMENT_20"            "FLAG_DOCUMENT_21"
 #library(mice)
#imputed_Data <- mice(train_less_missing, m=1, maxit=500, method='cart', seed=500)
#summary(imputed_Data)
# Preprocessing recipe, built step by step. The formula `~ .` registers every
# column of train_less_missing (TARGET included) as a predictor.
rec_obj <- recipe(~ ., data = train_less_missing)
# Character columns -> factors.
rec_obj <- step_string2factor(rec_obj, string_2_factor_names)
# Low-cardinality numeric columns -> factors.
rec_obj <- step_num2factor(rec_obj, num_2_factor_names)
# Fill remaining numeric NAs with the column mean ...
rec_obj <- step_meanimpute(rec_obj, all_numeric())
# ... and nominal NAs with the most frequent level.
rec_obj <- step_modeimpute(rec_obj, all_nominal())
# Estimate the step parameters from the training data.
rec_obj <- prep(rec_obj, stringsAsFactors = FALSE)

rec_obj
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##  predictor         93
## 
## Training data contained 307511 data points and 62364 incomplete rows. 
## 
## Operations:
## 
## Factor variables from NAME_CONTRACT_TYPE, ... [trained]
## Factor variables from AMT_REQ_CREDIT_BUREAU_HOUR, ... [trained]
## Mean Imputation for SK_ID_CURR, CNT_CHILDREN, ... [trained]
## Mode Imputation for TARGET, NAME_CONTRACT_TYPE, ... [trained]
# Apply the prepared recipe to the training table and inspect the result.
train_new <- rec_obj %>%
  bake(train_less_missing)
train_new %>%
  glimpse()
## Observations: 307,511
## Variables: 93
## $ SK_ID_CURR                          <int> 100002, 100003, 100004, 10...
## $ TARGET                              <fct> 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ NAME_CONTRACT_TYPE                  <fct> Cash loans, Cash loans, Re...
## $ FLAG_OWN_CAR                        <fct> N, N, Y, N, N, N, Y, Y, N,...
## $ FLAG_OWN_REALTY                     <fct> Y, N, Y, Y, Y, Y, Y, Y, Y,...
## $ CNT_CHILDREN                        <int> 0, 0, 0, 0, 0, 0, 1, 0, 0,...
## $ AMT_INCOME_TOTAL                    <dbl> 202500.00, 270000.00, 6750...
## $ AMT_CREDIT                          <dbl> 406597.5, 1293502.5, 13500...
## $ AMT_ANNUITY                         <dbl> 24700.5, 35698.5, 6750.0, ...
## $ AMT_GOODS_PRICE                     <dbl> 351000, 1129500, 135000, 2...
## $ NAME_INCOME_TYPE                    <fct> Working, State servant, Wo...
## $ NAME_EDUCATION_TYPE                 <fct> 2, 4, 2, 2, 2, 2, 4, 4, 2,...
## $ NAME_HOUSING_TYPE                   <fct> House / apartment, House /...
## $ REGION_POPULATION_RELATIVE          <dbl> 0.018801, 0.003541, 0.0100...
## $ DAYS_BIRTH                          <int> -9461, -16765, -19046, -19...
## $ DAYS_EMPLOYED                       <int> -637, -1188, -225, -3039, ...
## $ DAYS_REGISTRATION                   <dbl> -3648, -1186, -4260, -9833...
## $ DAYS_ID_PUBLISH                     <int> -2120, -291, -2531, -2437,...
## $ FLAG_MOBIL                          <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ FLAG_EMP_PHONE                      <fct> 1, 1, 1, 1, 1, 1, 1, 1, 0,...
## $ FLAG_WORK_PHONE                     <fct> 0, 0, 1, 0, 0, 1, 0, 1, 0,...
## $ FLAG_CONT_MOBILE                    <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ FLAG_PHONE                          <fct> 1, 1, 1, 0, 0, 1, 1, 0, 0,...
## $ FLAG_EMAIL                          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ OCCUPATION_TYPE                     <fct> Laborers, Core staff, Labo...
## $ CNT_FAM_MEMBERS                     <dbl> 1, 2, 1, 2, 1, 2, 3, 2, 2,...
## $ REGION_RATING_CLIENT                <fct> 2, 1, 2, 2, 2, 2, 2, 3, 2,...
## $ REGION_RATING_CLIENT_W_CITY         <fct> 2, 1, 2, 2, 2, 2, 2, 3, 2,...
## $ WEEKDAY_APPR_PROCESS_START          <fct> WEDNESDAY, MONDAY, MONDAY,...
## $ HOUR_APPR_PROCESS_START             <int> 10, 11, 9, 17, 11, 16, 16,...
## $ REG_REGION_NOT_LIVE_REGION          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ REG_REGION_NOT_WORK_REGION          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ LIVE_REGION_NOT_WORK_REGION         <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ REG_CITY_NOT_LIVE_CITY              <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ REG_CITY_NOT_WORK_CITY              <fct> 0, 0, 0, 0, 1, 0, 0, 1, 0,...
## $ LIVE_CITY_NOT_WORK_CITY             <fct> 0, 0, 0, 0, 1, 0, 0, 1, 0,...
## $ ORGANIZATION_TYPE                   <fct> Business Entity, School, G...
## $ EXT_SOURCE_2                        <dbl> 0.2629486, 0.6222458, 0.55...
## $ EXT_SOURCE_3                        <dbl> 0.13937578, 0.51085291, 0....
## $ FONDKAPREMONT_MODE                  <fct> reg oper account, reg oper...
## $ HOUSETYPE_MODE                      <fct> block of flats, block of f...
## $ WALLSMATERIAL_MODE                  <fct> "Stone, brick", "Block", "...
## $ EMERGENCYSTATE_MODE                 <fct> No, No, , , , , , , , , , ...
## $ OBS_30_CNT_SOCIAL_CIRCLE            <dbl> 2, 1, 0, 2, 0, 0, 1, 2, 1,...
## $ DEF_30_CNT_SOCIAL_CIRCLE            <dbl> 2, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ OBS_60_CNT_SOCIAL_CIRCLE            <dbl> 2, 1, 0, 2, 0, 0, 1, 2, 1,...
## $ DEF_60_CNT_SOCIAL_CIRCLE            <dbl> 2, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ DAYS_LAST_PHONE_CHANGE              <dbl> -1134, -828, -815, -617, -...
## $ FLAG_DOCUMENT_2                     <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_3                     <fct> 1, 1, 0, 1, 0, 1, 0, 1, 1,...
## $ FLAG_DOCUMENT_4                     <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_5                     <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_6                     <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_7                     <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_8                     <fct> 0, 0, 0, 0, 1, 0, 1, 0, 0,...
## $ FLAG_DOCUMENT_9                     <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_10                    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_11                    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_12                    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_13                    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_14                    <fct> 0, 0, 0, 0, 0, 0, 1, 0, 0,...
## $ FLAG_DOCUMENT_15                    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_16                    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_17                    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_18                    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_19                    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_20                    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ FLAG_DOCUMENT_21                    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ AMT_REQ_CREDIT_BUREAU_HOUR          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ AMT_REQ_CREDIT_BUREAU_DAY           <dbl> 0.000000000, 0.000000000, ...
## $ AMT_REQ_CREDIT_BUREAU_WEEK          <dbl> 0.00000000, 0.00000000, 0....
## $ AMT_REQ_CREDIT_BUREAU_MON           <dbl> 0.0000000, 0.0000000, 0.00...
## $ AMT_REQ_CREDIT_BUREAU_QRT           <dbl> 0.0000000, 0.0000000, 0.00...
## $ AMT_REQ_CREDIT_BUREAU_YEAR          <dbl> 1.000000, 0.000000, 0.0000...
## $ employed_ratio_birth                <dbl> 0.06732903, 0.07086191, 0....
## $ income_ratio_credit                 <dbl> 0.4980355, 0.2087356, 0.50...
## $ income_ratio_famsize                <dbl> 202500.00, 135000.00, 6750...
## $ income_ration_annuity               <dbl> 8.198215, 7.563343, 10.000...
## $ credit_ration_annuity               <dbl> 16.46110, 36.23409, 20.000...
## $ credit_ration_goods                 <dbl> 1.158397, 1.145199, 1.0000...
## $ credit_minus_goods                  <dbl> 55597.5, 164002.5, 0.0, 15...
## $ reg_ration_employed                 <dbl> 5.72684458, 0.99831650, 18...
## $ credit_ratio_annuity_ratio_employed <dbl> -2.584161e-02, -3.050007e-...
## $ reg_ratio_idpublish                 <dbl> 1.72075472, 4.07560137, 1....
## $ reg_ratio_birth                     <dbl> 0.385582919, 0.070742619, ...
## $ id_ratio_birth                      <dbl> 0.22407779, 0.01735759, 0....
## $ phone_ratio_birth                   <dbl> 0.1198604799, 0.0493886072...
## $ phone_ratio_employed                <dbl> 1.780219780, 0.696969697, ...
## $ document_sum                        <fct> 1, 1, 0, 1, 1, 1, 2, 1, 1,...
## $ sum_contact                         <fct> 4, 4, 5, 3, 3, 5, 4, 4, 2,...
## $ reliability_city_in_city            <int> 0, 0, 0, 0, 2, 0, 0, 2, 0,...
## $ inquiries_total_month               <dbl> 0.0000000, 0.0000000, 0.00...
## $ credit_as_goods                     <fct> 0, 0, 1, 0, 1, 0, 0, 1, 0,...
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Number of rows in each TARGET class.
Target_pie <- summarize(group_by(train_new, TARGET), count = n())

# Pie chart of the class counts.
p <- layout(
  plot_ly(Target_pie, labels = ~TARGET, values = ~count, type = "pie"),
  title = "Target variable distribution"
)

p
# Check for missing values once again after filling them in
train_new %>%
  is.na() %>%
  sum()
## [1] 0
#library(h2o)  # for fitting GLRMs
#h2o.no_progress()  # turn off progress bars
#h2o.init()
#library(Boruta)
#boruta <- Boruta(TARGET~., data = train_new, doTrace = 2)
#print(boruta)
#library(caTools)
#set.seed(123)   #  set seed to ensure you always have same random numbers generated
#sample = sample.split(train_new,SplitRatio = 0.75) # splits the data in the ratio mentioned in SplitRatio. #After splitting marks these rows as logical TRUE and the the remaining are marked as logical FALSE
#train1 =subset(train_new,sample ==TRUE) # creates a training dataset named train1 with rows which are marked as TRUE
#test1=subset(train_new, sample==FALSE)
# Training data: Separate into x and y tibbles
#x_train1 <- train1 %>% select(-TARGET)
#y_train1 <- train1 %>% select(TARGET)
# Training data: Separate into x and y tibbles
#x_test1 <- test1 %>% select(-TARGET)
#y_test1 <- test1 %>% select(TARGET)
# Drop the intermediate objects that are no longer needed.
rm(rec_obj, missing_train, train_less_missing, Train)
library(h2o)  # for fitting GLRMs
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:data.table':
## 
##     hour, month, week, year
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
# Keep H2O's progress bars out of the rendered document.
h2o.no_progress()  # turn off progress bars
# Connect R to an H2O cluster (default host/port) for the models below.
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         36 minutes 25 seconds 
##     H2O cluster timezone:       America/Los_Angeles 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.26.0.2 
##     H2O cluster version age:    2 months and 26 days  
##     H2O cluster name:           H2O_started_from_R_Ibragim_oeg384 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   0.75 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.5.3 (2019-03-11)
#train_h2o <- as.h2o(bind_cols(y_train1, x_train1))
#test_h2o <- as.h2o(bind_cols(y_test1, x_test1))
# Build the modelling table with the response column first, followed by all
# predictors, and upload it to the H2O cluster.
y_train1 <- select(train_new, TARGET)
x_train1 <- select(train_new, -TARGET)
data_h2o <- as.h2o(bind_cols(y_train1, x_train1))

# Three-way split: roughly 70% train, 15% validation, remainder as test.
# The seed is fixed so the same split is produced on every run.
splits_h2o <- h2o.splitFrame(data_h2o, ratios = c(0.7, 0.15), seed = 1234)
train_h2o <- splits_h2o[[1]]
valid_h2o <- splits_h2o[[2]]
test_h2o <- splits_h2o[[3]]

# Name of the response and names of the predictors (every other column).
y <- "TARGET"
x <- setdiff(names(train_h2o), y)

# Run H2O AutoML on the training split with a 90-second time budget.
#
# The leaderboard is ranked on the validation split, not the test split:
# the leader is taken from the leaderboard, so ranking candidates on
# test_h2o and then scoring the winner on that same frame would give an
# optimistically biased estimate. test_h2o is kept untouched until the
# final evaluation below.
automl_models_h2o <- h2o.automl(
  x = x,
  y = y,
  training_frame = train_h2o,
  validation_frame = valid_h2o,
  leaderboard_frame = valid_h2o,
  max_runtime_secs = 90
)

# Top-ranked model of the AutoML run.
automl_leader <- automl_models_h2o@leader

# Hold-out evaluation of the leader on the test split, followed by its
# confusion matrix.
performance_h2o <- h2o.performance(automl_leader, newdata = test_h2o)
performance_h2o %>%
  h2o.confusionMatrix()
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.0826596335960701:
##            0    1    Error         Rate
## 0      36634 5753 0.135726  =5753/42387
## 1       2244 1469 0.604363   =2244/3713
## Totals 38878 7222 0.173471  =7997/46100
# AUC from the most recently computed performance object
# (the AutoML leader scored on the test split).
h2o.auc(performance_h2o)
## [1] 0.7165997
# Response and predictor names used by the models below.
y <- "TARGET"
x <- setdiff(names(train_h2o), y)

# Small random forest on the training split: 5 trees, each at most 3 levels
# deep. The seed (same value as the data split) makes the stochastic fit
# repeatable from run to run.
rf <- h2o.randomForest(
  x = x,
  y = y,
  training_frame = train_h2o,
  ntrees = 5,
  max_depth = 3,
  seed = 1234
)
## Warning in .h2o.startModelJob(algo, params, h2oRestApiVersion): Dropping bad and constant columns: [FLAG_MOBIL].
# Score the random forest on the test split and show its confusion matrix.
performance_h2o <- h2o.performance(rf, newdata = test_h2o)
h2o.confusionMatrix(performance_h2o)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.108712327852845:
##            0    1    Error         Rate
## 0      36737 5650 0.133296  =5650/42387
## 1       2354 1359 0.633989   =2354/3713
## Totals 39091 7009 0.173623  =8004/46100
# AUC from the most recently computed performance object
# (the random forest scored on the test split).
h2o.auc(performance_h2o)
## [1] 0.6946275
# Print the full report for the random forest: model summary, training
# (out-of-bag) metrics, scoring history and variable importances.
summary(rf)
## Model Details:
## ==============
## 
## H2OBinomialModel: drf
## Model Key:  DRF_model_R_1571858567929_134 
## Model Summary: 
##   number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1               5                        5                 811         3
##   max_depth mean_depth min_leaves max_leaves mean_leaves
## 1         3    3.00000          8          8     8.00000
## 
## H2OBinomialMetrics: drf
## ** Reported on training data. **
## ** Metrics reported on Out-Of-Bag training samples **
## 
## MSE:  0.07231422
## RMSE:  0.268913
## LogLoss:  0.2692928
## Mean Per-Class Error:  0.3942504
## AUC:  0.6555009
## pr_auc:  0.1467019
## Gini:  0.3110017
## R^2:  0.02547541
## 
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
##             0     1    Error           Rate
## 0      143425 34155 0.192336  =34155/177580
## 1        9296  6297 0.596165    =9296/15593
## Totals 152721 40452 0.224933  =43451/193173
## 
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold    value idx
## 1                       max f1  0.101395 0.224712 178
## 2                       max f2  0.073456 0.344685 269
## 3                 max f0point5  0.133792 0.190074 100
## 4                 max accuracy  0.285150 0.918218   0
## 5                max precision  0.208608 0.293404  14
## 6                   max recall  0.031327 1.000000 399
## 7              max specificity  0.285150 0.998282   0
## 8             max absolute_mcc  0.099731 0.142670 183
## 9   max min_per_class_accuracy  0.077926 0.615982 254
## 10 max mean_per_class_accuracy  0.086025 0.619240 226
## 
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## 
## 
## 
## Scoring History: 
##             timestamp   duration number_of_trees training_rmse
## 1 2019-10-23 13:06:38  0.047 sec               0            NA
## 2 2019-10-23 13:06:38  0.760 sec               1       0.26906
## 3 2019-10-23 13:06:39  1.351 sec               2       0.26935
## 4 2019-10-23 13:06:39  1.869 sec               3       0.26964
## 5 2019-10-23 13:06:40  2.411 sec               4       0.26984
## 6 2019-10-23 13:06:41  3.211 sec               5       0.26891
##   training_logloss training_auc training_pr_auc training_lift
## 1               NA           NA              NA            NA
## 2          0.26837      0.65168         0.11107       1.91284
## 3          0.26963      0.65360         0.13520       2.49738
## 4          0.27057      0.65059         0.13637       2.60154
## 5          0.27133      0.64575         0.13612       2.70897
## 6          0.26929      0.65550         0.14670       3.51090
##   training_classification_error
## 1                            NA
## 2                       0.34122
## 3                       0.27810
## 4                       0.23603
## 5                       0.22372
## 6                       0.22493
## 
## Variable Importances: (Extract with `h2o.varimp`) 
## =================================================
## 
## Variable Importances: 
##              variable relative_importance scaled_importance percentage
## 1        EXT_SOURCE_2          461.370483          1.000000   0.358870
## 2        EXT_SOURCE_3          328.453247          0.711908   0.255482
## 3 credit_ration_goods          147.288422          0.319241   0.114566
## 4     OCCUPATION_TYPE          122.102325          0.264651   0.094975
## 5   ORGANIZATION_TYPE           44.105118          0.095596   0.034306
## 
## ---
##                 variable relative_importance scaled_importance percentage
## 86        id_ratio_birth            0.000000          0.000000   0.000000
## 87     phone_ratio_birth            0.000000          0.000000   0.000000
## 88  phone_ratio_employed            0.000000          0.000000   0.000000
## 89           sum_contact            0.000000          0.000000   0.000000
## 90 inquiries_total_month            0.000000          0.000000   0.000000
## 91       credit_as_goods            0.000000          0.000000   0.000000
# Gradient boosting machine on the training split: 15 trees of depth <= 5,
# at least 2 observations per leaf, learning rate 0.01.
#
# TARGET is a two-class response (the confusion matrices only contain the
# classes 0 and 1), so the Bernoulli distribution is the matching choice;
# "multinomial" is intended for responses with more than two classes.
gbm2 <- h2o.gbm(
  x = x,
  y = y,
  training_frame = train_h2o,
  ntrees = 15,
  max_depth = 5,
  min_rows = 2,
  learn_rate = 0.01,
  distribution = "bernoulli"
)
## Warning in .h2o.startModelJob(algo, params, h2oRestApiVersion): Dropping bad and constant columns: [FLAG_MOBIL].
# Score the GBM on the test split and show its confusion matrix.
performance_h2o <- h2o.performance(gbm2, newdata = test_h2o)
h2o.confusionMatrix(performance_h2o)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.0852021102728569:
##            0    1    Error         Rate
## 0      35612 6775 0.159837  =6775/42387
## 1       2083 1630 0.561002   =2083/3713
## Totals 37695 8405 0.192148  =8858/46100
# AUC from the most recently computed performance object
# (the GBM scored on the test split).
h2o.auc(performance_h2o)
## [1] 0.7167203
#library(Matrix)
#library(MLmetrics)
#library(lightgbm)